So you want to know the range of products being sold by your competitor. You go to their website, see all the products (along with their details), and want to compare them with your own range of products. Great! How do you do that? How do you get the details available on the website into a format in which you can analyse them?
Hmmm.. If you have these or similar questions on your mind, you have come to the right place. In this post, we will learn about web scraping using R. If you like a more structured approach, try our free online course, Web Scraping with R.
So, what exactly is web scraping (also called web mining or web harvesting)? It is a technique for extracting data from websites. Remember, websites contain a wealth of useful data, but they are designed for human consumption and not for data analysis. The goal of web scraping is to take advantage of the pattern or structure of web pages to extract and store data in a format suitable for data analysis.
To be able to scrape data from websites, we need to understand how the web pages are structured. In this section, we will learn enough about HTML to be able to start scraping data from websites.
library(robotstxt)
library(rvest)
library(selectr)
library(xml2)
library(dplyr)
library(stringr)
library(forcats)
library(magrittr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(tibble)
library(purrr)
# Ask robotstxt whether the Amazon mobile-phones listing may be fetched by
# bots before requesting the page itself.
paths_allowed(paths = "https://www.amazon.in/mobile-phones/b?ie=UTF8&node=1389401031&ref_=nav_shopall_sbc_mobcomp_all_mobiles")
##
www.amazon.in No encoding supplied: defaulting to UTF-8.
## [1] TRUE
# Download and parse the Amazon listing page, then print the parsed document.
top_phones <- read_html(
  "https://www.amazon.in/mobile-phones/b?ie=UTF8&node=1389401031&ref_=nav_shopall_sbc_mobcomp_all_mobiles"
)
top_phones
## {xml_document}
## <html class="a-no-js" data-19ax5a9jf="dingo">
## [1] <head>\n<script>var aPageStart = (new Date()).getTime();</script><me ...
## [2] <body class="a-aui_149818-c a-aui_152852-c a-aui_157141-c a-aui_1586 ...
# Product names: take the text of every title link and keep only the part
# before the first "(", with surrounding whitespace trimmed.
mobile_name <- top_phones %>%
  html_nodes(".crwTitle a") %>%
  html_text() %>%
  str_split("\\(") %>%
  map_chr(1) %>%
  str_trim()
mobile_name
## [1] "Samsung Galaxy M30" "Samsung Galaxy M30" "Redmi 6 Pro"
## [4] "Redmi 6A" "Samsung Galaxy M20" "Samsung Galaxy M30"
## [7] "Samsung Galaxy M10" "Redmi 6 Pro" "Mi Redmi Y2"
## [10] "OnePlus 6T"
# Product colour: the text between the first "(" of the title and the next
# comma.
mobile_color <- top_phones %>%
  html_nodes(".crwTitle a") %>%
  html_text() %>%
  str_split("\\(") %>%
  # A title without "(" has no second piece; return NA for it instead of
  # letting map_chr() abort the whole pipeline.
  map_chr(2, .default = NA_character_) %>%
  str_split(",") %>%
  map_chr(1)
mobile_color
## [1] "Gradation Black" "Gradation Blue" "Black"
## [4] "Black" "Ocean Blue" "Gradation Black"
## [7] "Charcoal Black" "Black" "Black"
## [10] "Mirror Black"
# Star rating per product: the first three characters of the rating icon's
# alt text, converted to a number.
mobile_rating <- top_phones %>%
  html_nodes(".crwProductDetail span .a-icon-alt") %>%
  html_text() %>%
  str_sub(1, 3) %>%
  as.numeric()
mobile_rating
## [1] 4.1 4.1 4.2 4.2 3.7 4.3 4.1 4.1 4.3 4.5
# Number of customer reviews per product. The count is scraped as text with
# thousands separators, so every comma is stripped before the numeric
# conversion. str_replace() would remove only the first comma and turn any
# count with two or more separators (e.g. "1,00,000") into NA.
mobile_review <- top_phones %>%
  html_nodes(".crwProductDetail span.a-size-small .a-link-normal") %>%
  html_text() %>%
  str_replace_all(",", "") %>%
  as.numeric()
mobile_review
## [1] 496 1003 6334 23230 11616 284 2843 14680 18039 14240
# List (struck-through) price per product, as a whole number.
real_price <- top_phones %>%
  html_nodes(".crwProductDetail .crwPrice .a-text-strike") %>%
  html_text() %>%
  str_trim() %>%
  # Drop the first four characters (presumably the currency prefix -- confirm
  # against the page markup).
  str_sub(start = 5) %>%
  # Hard-coded alignment: only nine struck-through prices were found for the
  # ten products, so an NA is inserted after position 9 to keep this vector
  # the same length as the other columns. Re-check whenever the page changes.
  append(NA, after = 9) %>%
  # Remove every thousands separator; str_replace() would strip only the
  # first one and yield NA for prices such as "1,09,999.00".
  str_replace_all(",", "") %>%
  # Keep the part before the decimal point.
  str_split("\\.") %>%
  map_chr(1) %>%
  as.numeric()
real_price
## [1] 16490 16490 11499 6999 13390 19590 9290 13499 10499 NA
# Current (discounted) selling price per product, as a whole number.
discounted_price <- top_phones %>%
  html_nodes(".crwProductDetail .crwActualPrice") %>%
  html_text() %>%
  str_trim() %>%
  # Drop the first four characters (presumably the currency prefix -- confirm
  # against the page markup).
  str_sub(start = 5) %>%
  # Remove every thousands separator; str_replace() would strip only the
  # first one and yield NA for prices such as "1,09,999.00".
  str_replace_all(",", "") %>%
  # Keep the part before the decimal point.
  str_split("\\.") %>%
  map_chr(1) %>%
  as.numeric()
discounted_price
## [1] 14990 14990 8999 5999 12990 17990 8990 10999 8999 37999
# Combine the scraped vectors into one table with a row per phone.
best_sellers <- tibble(
  title = mobile_name,
  color = mobile_color,
  rating = mobile_rating,
  reviews = mobile_review,
  `Real Price (Rs.)` = real_price,
  `Discount Price (Rs.)` = discounted_price
)
best_sellers
## # A tibble: 10 x 6
## title color rating reviews `Real Price (Rs~ `Discount Price (~
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Samsung Ga~ Gradatio~ 4.1 496 16490 14990
## 2 Samsung Ga~ Gradatio~ 4.1 1003 16490 14990
## 3 Redmi 6 Pro Black 4.2 6334 11499 8999
## 4 Redmi 6A Black 4.2 23230 6999 5999
## 5 Samsung Ga~ Ocean Bl~ 3.7 11616 13390 12990
## 6 Samsung Ga~ Gradatio~ 4.3 284 19590 17990
## 7 Samsung Ga~ Charcoal~ 4.1 2843 9290 8990
## 8 Redmi 6 Pro Black 4.1 14680 13499 10999
## 9 Mi Redmi Y2 Black 4.3 18039 10499 8999
## 10 OnePlus 6T Mirror B~ 4.5 14240 NA 37999
# Ask robotstxt whether the IMDb top-250 search page may be fetched by bots.
paths_allowed(paths = "https://www.imdb.com/search/title?groups=top_250&sort=user_rating")
##
www.imdb.com No encoding supplied: defaulting to UTF-8.
## [1] TRUE
# Download and parse the IMDb listing, then print the parsed document.
imdb <- read_html(
  "https://www.imdb.com/search/title?groups=top_250&sort=user_rating"
)
imdb
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="styleguide-v2" class="fixed">\n\n <img height=" ...
# Movie titles: text of the link inside each list item's heading.
movie_title <- imdb %>%
  html_nodes(".lister-item-content h3 a") %>%
  html_text()
movie_title
## [1] "The Shawshank Redemption"
## [2] "The Godfather"
## [3] "The Dark Knight"
## [4] "The Godfather: Part II"
## [5] "The Lord of the Rings: The Return of the King"
## [6] "Pulp Fiction"
## [7] "Schindler's List"
## [8] "Il buono, il brutto, il cattivo"
## [9] "12 Angry Men"
## [10] "Inception"
## [11] "Fight Club"
## [12] "The Lord of the Rings: The Fellowship of the Ring"
## [13] "Forrest Gump"
## [14] "The Lord of the Rings: The Two Towers"
## [15] "The Matrix"
## [16] "Goodfellas"
## [17] "Star Wars: Episode V - The Empire Strikes Back"
## [18] "One Flew Over the Cuckoo's Nest"
## [19] "Shichinin no samurai"
## [20] "Interstellar"
## [21] "Cidade de Deus"
## [22] "Sen to Chihiro no kamikakushi"
## [23] "Saving Private Ryan"
## [24] "The Green Mile"
## [25] "La vita è bella"
## [26] "The Usual Suspects"
## [27] "Se7en"
## [28] "Léon"
## [29] "The Silence of the Lambs"
## [30] "Star Wars"
## [31] "It's a Wonderful Life"
## [32] "Andhadhun"
## [33] "Dangal"
## [34] "Spider-Man: Into the Spider-Verse"
## [35] "Avengers: Infinity War"
## [36] "Whiplash"
## [37] "The Intouchables"
## [38] "The Prestige"
## [39] "The Departed"
## [40] "The Pianist"
## [41] "Memento"
## [42] "Gladiator"
## [43] "American History X"
## [44] "The Lion King"
## [45] "Terminator 2: Judgment Day"
## [46] "Nuovo Cinema Paradiso"
## [47] "Hotaru no haka"
## [48] "Back to the Future"
## [49] "Raiders of the Lost Ark"
## [50] "Apocalypse Now"
# Release year per movie, as a number.
#
# The first run of four digits is parsed directly. The previous round trip
# through as.Date(format = "%Y") let R fill in the missing month and day from
# today's date, so on 29 February every non-leap year came back as NA. It
# also relied on the year sitting at fixed character positions 2-5.
movie_year <- imdb %>%
  html_nodes(".lister-item-content h3 .lister-item-year") %>%
  html_text() %>%
  str_extract("\\d{4}") %>%
  as.numeric()
movie_year
## [1] 1994 1972 2008 1974 2003 1994 1993 1966 1957 2010 1999 2001 1994 2002
## [15] 1999 1990 1980 1975 1954 2014 2002 2001 1998 1999 1997 1995 1995 1994
## [29] 1991 1977 1946 2018 2016 2018 2018 2014 2011 2006 2006 2002 2000 2000
## [43] 1998 1994 1991 1988 1988 1985 1981 1979
# Certificate text of every movie that shows one.
# NOTE(review): the output below has 44 values for 50 titles, so this vector
# is not aligned row-by-row with the other movie columns.
movie_certificate <- imdb %>%
  html_nodes(".lister-item-content p .certificate") %>%
  html_text()
movie_certificate
## [1] "A" "A" "UA" "PG-13" "A" "A" "UA" "A"
## [9] "PG-13" "PG-13" "PG-13" "A" "A" "PG" "UA" "R"
## [17] "PG" "A" "A" "PG-13" "A" "R" "A" "A"
## [25] "U" "PG" "UA" "U" "U" "UA" "A" "UA"
## [33] "PG-13" "A" "R" "R" "R" "A" "U" "U"
## [41] "R" "U" "PG" "R"
# Runtime per movie: the leading token of the runtime text (before the first
# space), converted to a number.
movie_runtime <- imdb %>%
  html_nodes(".lister-item-content p .runtime") %>%
  html_text() %>%
  str_split(" ") %>%
  map_chr(1) %>%
  as.numeric()
movie_runtime
## [1] 142 175 152 202 201 154 195 161 96 148 139 178 142 179 136 146 124
## [18] 133 207 169 130 125 169 189 116 106 127 110 118 121 130 139 161 117
## [35] 149 106 112 130 151 150 113 155 119 88 137 155 89 116 115 147
# Genre list per movie, with surrounding whitespace removed.
movie_genre <- imdb %>%
  html_nodes(".lister-item-content p .genre") %>%
  html_text() %>%
  str_trim()
movie_genre
## [1] "Drama" "Crime, Drama"
## [3] "Action, Crime, Drama" "Crime, Drama"
## [5] "Adventure, Drama, Fantasy" "Crime, Drama"
## [7] "Biography, Drama, History" "Western"
## [9] "Drama" "Action, Adventure, Sci-Fi"
## [11] "Drama" "Adventure, Drama, Fantasy"
## [13] "Drama, Romance" "Adventure, Drama, Fantasy"
## [15] "Action, Sci-Fi" "Biography, Crime, Drama"
## [17] "Action, Adventure, Fantasy" "Drama"
## [19] "Adventure, Drama" "Adventure, Drama, Sci-Fi"
## [21] "Crime, Drama" "Animation, Adventure, Family"
## [23] "Drama, War" "Crime, Drama, Fantasy"
## [25] "Comedy, Drama, Romance" "Crime, Mystery, Thriller"
## [27] "Crime, Drama, Mystery" "Action, Crime, Drama"
## [29] "Crime, Drama, Thriller" "Action, Adventure, Fantasy"
## [31] "Drama, Family, Fantasy" "Crime, Thriller"
## [33] "Action, Biography, Drama" "Animation, Action, Adventure"
## [35] "Action, Adventure, Sci-Fi" "Drama, Music"
## [37] "Biography, Comedy, Drama" "Drama, Mystery, Sci-Fi"
## [39] "Crime, Drama, Thriller" "Biography, Drama, Music"
## [41] "Mystery, Thriller" "Action, Adventure, Drama"
## [43] "Drama" "Animation, Adventure, Drama"
## [45] "Action, Sci-Fi" "Drama"
## [47] "Animation, Drama, War" "Adventure, Comedy, Sci-Fi"
## [49] "Action, Adventure" "Drama, War"
# IMDb rating per movie, read from the rating element's data-value attribute.
movie_rating <- imdb %>%
  html_nodes(".ratings-bar .ratings-imdb-rating") %>%
  html_attr("data-value") %>%
  as.numeric()
movie_rating
## [1] 9.3 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7 8.7
## [18] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5 8.5
## [35] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5
# Vote count per movie, read from the content attribute of the ratingCount
# <meta> tags.
movie_votes <- imdb %>%
  html_nodes(xpath = '//meta[@itemprop="ratingCount"]') %>%
  html_attr("content") %>%
  as.numeric()
movie_votes
## [1] 2071204 1421091 2037123 986136 1474514 1619684 1073350 614714
## [9] 584933 1815880 1657428 1491094 1587637 1333527 1487841 894262
## [17] 1039292 821620 279794 1275388 637309 548835 1095312 999918
## [25] 544734 896953 1270408 912498 1117750 1108950 352574 38770
## [33] 118133 170890 615195 604715 665632 1052099 1063315 633051
## [41] 1020824 1197347 941247 822331 896944 198198 192417 922302
## [49] 802403 541876
Revenue is not available for movies number 31 and 47.
# Gross revenue per movie, taken from the span[name="nv"] texts that start
# with "$".
# NOTE(review): the two NA positions are hard-coded for the movies that had
# no revenue when the page was scraped; re-check them if the listing changes.
movie_revenue <- imdb %>%
  html_nodes(xpath = '//span[@name="nv"]') %>%
  html_text() %>%
  str_extract(pattern = "^\\$.*") %>%
  # Discard the spans that did not match the "$..." pattern.
  na.omit() %>%
  as.character() %>%
  # Re-insert the missing revenues so the vector lines up with the titles.
  append(values = NA, after = 30) %>%
  append(values = NA, after = 46) %>%
  # Strip the first and the last character, keeping the number in between.
  str_sub(start = 2, end = -2) %>%
  as.numeric()
movie_revenue
## [1] 28.34 134.97 534.86 57.30 377.85 107.93 96.07 6.10 4.36 292.58
## [11] 37.03 315.54 330.25 342.55 171.48 46.84 290.48 112.00 0.27 188.02
## [21] 7.56 10.06 216.54 136.80 57.60 23.34 100.13 19.50 130.74 322.74
## [31] NA 1.19 12.39 190.24 678.82 13.09 13.18 53.09 132.38 32.57
## [41] 25.54 187.71 6.72 312.90 204.84 11.99 NA 210.61 248.16 83.47
# Combine the scraped vectors into one table with a row per movie.
top_50 <- tibble(
  title = movie_title,
  release = movie_year,
  `runtime (mins)` = movie_runtime,
  genre = movie_genre,
  rating = movie_rating,
  votes = movie_votes,
  `revenue ($ millions)` = movie_revenue
)
top_50
## # A tibble: 50 x 7
## title release `runtime (mins)` genre rating votes `revenue ($ mil~
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 The Sha~ 1994 142 Drama 9.3 2.07e6 28.3
## 2 The God~ 1972 175 Crime,~ 9.2 1.42e6 135.
## 3 The Dar~ 2008 152 Action~ 9 2.04e6 535.
## 4 The God~ 1974 202 Crime,~ 9 9.86e5 57.3
## 5 The Lor~ 2003 201 Advent~ 8.9 1.47e6 378.
## 6 Pulp Fi~ 1994 154 Crime,~ 8.9 1.62e6 108.
## 7 Schindl~ 1993 195 Biogra~ 8.9 1.07e6 96.1
## 8 Il buon~ 1966 161 Western 8.9 6.15e5 6.1
## 9 12 Angr~ 1957 96 Drama 8.9 5.85e5 4.36
## 10 Incepti~ 2010 148 Action~ 8.8 1.82e6 293.
## # ... with 40 more rows
# Ask robotstxt whether the SimilarWeb top-websites page may be fetched by
# bots.
paths_allowed(paths = "https://www.similarweb.com/top-websites")
##
www.similarweb.com No encoding supplied: defaulting to UTF-8.
## [1] TRUE
# Download and parse the SimilarWeb ranking page, then print the parsed
# document.
top_sites <- read_html(
  "https://www.similarweb.com/top-websites"
)
top_sites
## {xml_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body itemscope itemtype="http://schema.org/WebPage" data-backbone-c ...
# Convert the first <table> on the page into a data frame.
ranking <- top_sites %>%
  html_node("table") %>%
  html_table()
ranking
## Rank \n \n Traffic rank of site, as compared to all other sites in the world (#1 being the site with the most traffic in the world)
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
## 11 11
## 12 12
## 13 13
## 14 14
## 15 15
## 16 16
## 17 17
## 18 18
## 19 19
## 20 20
## 21 21
## 22 22
## 23 23
## 24 24
## 25 25
## 26 26
## 27 27
## 28 28
## 29 29
## 30 30
## 31 31
## 32 32
## 33 33
## 34 34
## 35 35
## 36 36
## 37 37
## 38 38
## 39 39
## 40 40
## 41 41
## 42 42
## 43 43
## 44 44
## 45 45
## 46 46
## 47 47
## 48 48
## 49 49
## 50 50
## Website \n \n Domain, including all meaningful subdomains
## 1 google.com
## 2 youtube.com
## 3 facebook.com
## 4 baidu.com
## 5 instagram.com
## 6 twitter.com
## 7 pornhub.com
## 8 yahoo.com
## 9 xvideos.com
## 10 wikipedia.org
## 11 vk.com
## 12 xnxx.com
## 13 yandex.ru
## 14 amazon.com
## 15 live.com
## 16 ampproject.org
## 17 mail.ru
## 18 netflix.com
## 19 xhamster.com
## 20 ok.ru
## 21 reddit.com
## 22 yahoo.co.jp
## 23 qq.com
## 24 ebay.com
## 25 bing.com
## 26 google.com.br
## 27 linkedin.com
## 28 whatsapp.com
## 29 aliexpress.com
## 30 google.de
## 31 sogou.com
## 32 pinterest.com
## 33 office.com
## 34 msn.com
## 35 naver.com
## 36 google.co.jp
## 37 twitch.tv
## 38 paypal.com
## 39 google.fr
## 40 imdb.com
## 41 google.co.uk
## 42 craigslist.org
## 43 taobao.com
## 44 github.com
## 45 amazon.de
## 46 microsoft.com
## 47 amazon.co.jp
## 48 google.it
## 49 google.co.in
## 50 sm.cn
## Category \n \n SimilarWeb website categorization
## 1 Internet and Telecom > Search Engine
## 2 Arts and Entertainment > TV and Video
## 3 Internet and Telecom > Social Network
## 4 Internet and Telecom > Search Engine
## 5 Internet and Telecom > Social Network
## 6 Internet and Telecom > Social Network
## 7 Adult
## 8 News and Media
## 9 Adult
## 10 Reference > Dictionaries and Encyclopedias
## 11 Internet and Telecom > Social Network
## 12 Adult
## 13 Internet and Telecom > Search Engine
## 14 Shopping > General Merchandise
## 15 Internet and Telecom > Email
## 16 Computer and Electronics > Software
## 17 Internet and Telecom > Email
## 18 Arts and Entertainment > TV and Video
## 19 Adult
## 20 Internet and Telecom > Social Network
## 21 Internet and Telecom > Social Network
## 22 News and Media
## 23 News and Media
## 24 Shopping > General Merchandise
## 25 Internet and Telecom > Search Engine
## 26 Internet and Telecom > Search Engine
## 27 Internet and Telecom > Social Network
## 28 Internet and Telecom > Social Network
## 29 Shopping > General Merchandise
## 30 Internet and Telecom > Search Engine
## 31 Internet and Telecom > Search Engine
## 32 Internet and Telecom > Social Network
## 33 Computer and Electronics > Software
## 34 News and Media
## 35 News and Media
## 36 Internet and Telecom > Search Engine
## 37 Games > Video Games
## 38 Finance > Financial Management
## 39 Internet and Telecom > Search Engine
## 40 Arts and Entertainment > Movies
## 41 Internet and Telecom > Search Engine
## 42 Shopping > Classifieds
## 43 Shopping > General Merchandise
## 44 Computer and Electronics > Software
## 45 Shopping > General Merchandise
## 46 Computer and Electronics > Software
## 47 Shopping > General Merchandise
## 48 Internet and Telecom > Search Engine
## 49 Internet and Telecom > Search Engine
## 50 Business and Industry
## Change \n \n Change in ranking from previous month
## 1 =
## 2 =
## 3 =
## 4 =
## 5 =
## 6 =
## 7 =
## 8 =
## 9 =
## 10 =
## 11 =
## 12 +1
## 13 -1
## 14 =
## 15 =
## 16 =
## 17 =
## 18 =
## 19 =
## 20 +1
## 21 -1
## 22 =
## 23 =
## 24 =
## 25 =
## 26 +2
## 27 -1
## 28 +1
## 29 +1
## 30 +1
## 31 -4
## 32 =
## 33 =
## 34 =
## 35 =
## 36 +2
## 37 =
## 38 +2
## 39 +2
## 40 -4
## 41 -2
## 42 +4
## 43 +7
## 44 +5
## 45 -3
## 46 -2
## 47 +1
## 48 +4
## 49 -6
## 50 -3
## Avg. Visit Duration \n \n Average time spent by users on the website per visit
## 1 00:09:46
## 2 00:21:52
## 3 00:11:13
## 4 00:06:47
## 5 00:06:31
## 6 00:09:21
## 7 00:10:27
## 8 00:06:23
## 9 00:12:25
## 10 00:03:42
## 11 00:16:25
## 12 00:14:22
## 13 00:10:51
## 14 00:05:51
## 15 00:07:47
## 16 00:03:53
## 17 00:07:28
## 18 00:08:50
## 19 00:11:56
## 20 00:12:55
## 21 00:09:13
## 22 00:09:59
## 23 00:03:52
## 24 00:06:12
## 25 00:06:12
## 26 00:07:50
## 27 00:05:50
## 28 00:02:24
## 29 00:08:29
## 30 00:08:26
## 31 00:04:14
## 32 00:05:46
## 33 00:11:39
## 34 00:05:30
## 35 00:13:34
## 36 00:09:12
## 37 00:06:43
## 38 00:04:45
## 39 00:08:47
## 40 00:03:17
## 41 00:08:32
## 42 00:08:49
## 43 00:06:01
## 44 00:06:18
## 45 00:07:06
## 46 00:04:05
## 47 00:06:20
## 48 00:08:39
## 49 00:06:34
## 50 00:04:51
## Pages / Visit \n \n Average website pages viewed per visit
## 1 8.56
## 2 9.02
## 3 10.49
## 4 7.62
## 5 14.15
## 6 7.44
## 7 8.52
## 8 6.40
## 9 10.05
## 10 2.85
## 11 20.24
## 12 10.30
## 13 9.33
## 14 8.03
## 15 8.66
## 16 3.53
## 17 6.78
## 18 4.56
## 19 11.77
## 20 14.58
## 21 6.57
## 22 7.54
## 23 3.11
## 24 7.72
## 25 5.67
## 26 7.93
## 27 5.77
## 28 1.93
## 29 9.31
## 30 8.78
## 31 4.52
## 32 5.85
## 33 5.53
## 34 5.30
## 35 10.62
## 36 11.63
## 37 3.89
## 38 6.94
## 39 8.49
## 40 5.92
## 41 8.16
## 42 11.17
## 43 5.54
## 44 6.57
## 45 9.66
## 46 3.58
## 47 8.09
## 48 9.52
## 49 6.00
## 50 3.74
## Bounce Rate \n \n Percentage of visitors that view only 1 page on the website before exiting
## 1 30.89%
## 2 28.05%
## 3 30.64%
## 4 33.96%
## 5 35.58%
## 6 30.77%
## 7 26.47%
## 8 39.49%
## 9 27.73%
## 10 59.47%
## 11 25.17%
## 12 19.89%
## 13 25.67%
## 14 42.21%
## 15 18.33%
## 16 23.82%
## 17 34.96%
## 18 38.86%
## 19 25.04%
## 20 23.19%
## 21 42.69%
## 22 30.04%
## 23 55.77%
## 24 37.19%
## 25 45.08%
## 26 31.08%
## 27 35.61%
## 28 59.39%
## 29 33.89%
## 30 28.85%
## 31 41.15%
## 32 45.40%
## 33 34.54%
## 34 51.92%
## 35 24.19%
## 36 27.96%
## 37 33.87%
## 38 20.72%
## 39 31.06%
## 40 43.45%
## 41 30.14%
## 42 27.65%
## 43 39.67%
## 44 44.25%
## 45 31.85%
## 46 51.02%
## 47 39.73%
## 48 29.17%
## 49 38.53%
## 50 49.38%
# Replace the long scraped column headers with short snake_case names.
col_names <- c(
  "ranking",
  "domain",
  "category",
  "change",
  "avg_time_on_site",
  "avg_pages_per_visit",
  "bounce_rate"
)
names(ranking) <- col_names
head(ranking)
## ranking domain category change
## 1 1 google.com Internet and Telecom > Search Engine =
## 2 2 youtube.com Arts and Entertainment > TV and Video =
## 3 3 facebook.com Internet and Telecom > Social Network =
## 4 4 baidu.com Internet and Telecom > Search Engine =
## 5 5 instagram.com Internet and Telecom > Social Network =
## 6 6 twitter.com Internet and Telecom > Social Network =
## avg_time_on_site avg_pages_per_visit bounce_rate
## 1 00:09:46 8.56 30.89%
## 2 00:21:52 9.02 28.05%
## 3 00:11:13 10.49 30.64%
## 4 00:06:47 7.62 33.96%
## 5 00:06:31 14.15 35.58%
## 6 00:09:21 7.44 30.77%
# Number of sites in each category, most frequent first.
count(ranking, category, sort = TRUE)
## # A tibble: 14 x 2
## category n
## <chr> <int>
## 1 Internet and Telecom > Search Engine 12
## 2 Internet and Telecom > Social Network 9
## 3 Shopping > General Merchandise 6
## 4 News and Media 5
## 5 Adult 4
## 6 Computer and Electronics > Software 4
## 7 Arts and Entertainment > TV and Video 2
## 8 Internet and Telecom > Email 2
## 9 Arts and Entertainment > Movies 1
## 10 Business and Industry 1
## 11 Finance > Financial Management 1
## 12 Games > Video Games 1
## 13 Reference > Dictionaries and Encyclopedias 1
## 14 Shopping > Classifieds 1
# Ask robotstxt whether the Wikipedia list of RBI governors may be fetched by
# bots.
paths_allowed(paths = "https://en.wikipedia.org/wiki/List_of_Governors_of_Reserve_Bank_of_India")
##
en.wikipedia.org
## [1] TRUE
# Download and parse the Wikipedia page, then print the parsed document.
rbi_guv <- read_html(
  "https://en.wikipedia.org/wiki/List_of_Governors_of_Reserve_Bank_of_India"
)
rbi_guv
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
# Convert every <table> on the page to a data frame and keep the second one,
# which holds the list of governors.
profile <- rbi_guv %>%
  html_nodes("table") %>%
  html_table() %>%
  .[[2]]
profile
## No. Officeholder Portrait Term start Term end
## 1 1 Osborne Smith NA 1 April 1935 30 June 1937
## 2 2 James Braid Taylor NA 1 July 1937 17 February 1943
## 3 3 C. D. Deshmukh NA 11 August 1943ii 30 May 1949
## 4 4 Benegal Rama Rau NA 1 July 1949 14 January 1957
## 5 5 K. G. Ambegaonkar NA 14 January 1957 28 February 1957
## 6 6 H. V. R. Iyengar NA 1 March 1957 28 February 1962
## 7 7 P. C. Bhattacharya NA 1 March 1962 30 June 1967
## 8 8 Lakshmi Kant Jha NA 1 July 1967 3 May 1970
## 9 9 B. N. Adarkar NA 4 May 1970 15 June 1970
## 10 10 Sarukkai Jagannathan NA 16 June 1970 19 May 1975
## 11 11 N. C. Sen Gupta NA 19 May 1975 19 August 1975
## 12 12 K. R. Puri NA 20 August 1975 2 May 1977
## 13 13 M. Narasimham NA 3 May 1977 30 November 1977
## 14 14 I. G. Patel NA 1 December 1977 15 September 1982
## 15 15 Manmohan Singh NA 16 September 1982 14 January 1985
## 16 16 Amitav Ghosh NA 15 January 1985 4 February 1985
## 17 17 R. N. Malhotra NA 4 February 1985 22 December 1990
## 18 18 S. Venkitaramanan NA 22 December 1990 21 December 1992
## 19 19 C. Rangarajan NA 22 December 1992 21 November 1997
## 20 20 Bimal Jalan NA 22 November 1997 6 September 2003
## 21 21 Y. Venugopal Reddy NA 6 September 2003 5 September 2008
## 22 22 D. Subbarao NA 5 September 2008 4 September 2013
## 23 23 Raghuram Rajan NA 4 September 2013 4 September 2016
## 24 24 Urjit Patel NA 4 September 2016 11 December 2018
## 25 25 Shaktikanta Das NA 12 December 2018 Incumbent
## Term in office Background
## 1 821 days Banker
## 2 2057 days Indian Civil Service (ICS) officer
## 3 2150 days ICS officer
## 4 2754 days ICS officer
## 5 45 days ICS officer
## 6 1825 days ICS officer
## 7 1947 days Indian Audit and Accounts Service officer
## 8 1037 days ICS officer
## 9 42 days Economist
## 10 1798 days ICS officer
## 11 92 days ICS officer
## 12 621 days
## 13 211 days Career Reserve Bank of India officer
## 14 1749 days Economist
## 15 851 days Economist
## 16 20 days Banker
## 17 2147 days Indian Administrative Service (IAS) officer
## 18 730 days IAS officer
## 19 1795 days Economist
## 20 2114 days Economist
## 21 1826 days IAS officer
## 22 1825 days IAS officer
## 23 1096 days Economist
## 24 946 days Economist
## 25 117 days IAS officer
## Prior office(s)
## 1 Managing Governor of the Imperial Bank of India
## 2 Deputy Governor of the Reserve Bank of India\n\nController of Currency
## 3 Deputy Governor of the Reserve Bank of India\nCustodian of Enemy Property
## 4 Ambassador of India to the United States\n\nAmbassador of India to Japan\n\nChairman of Bombay Port Trust
## 5 Finance Secretary
## 6 Chairman of the State Bank of India
## 7 Chairman of the State Bank of India\nSecretary in the Ministry of Finance
## 8 Secretary to the Prime Minister of India
## 9 Executive Director at the International Monetary Fund
## 10 Executive Director at the World Bank
## 11 Banking Secretary
## 12 Chairman and Managing Director of the Life Insurance Corporation
## 13 Deputy Governor of the Reserve Bank of India
## 14 Director of the London School of Economics\n\nDeputy Administrator of the United Nations Development Programme\nChief Economic Adviser to the Government of India
## 15 Secretary in the Ministry of Finance\n\nChief Economic Adviser to the Government of India
## 16 Deputy Governor of the Reserve Bank of India\n\nChairman of the Allahabad Bank
## 17 Finance Secretary\n\nExecutive Director at the International Monetary Fund
## 18 Finance Secretary
## 19 Deputy Governor of the Reserve Bank of India
## 20 Finance Secretary\n\nBanking Secretary\n\nChief Economic Adviser to the Government of India
## 21 Executive Director at the International Monetary Fund\n\nDeputy Governor of the Reserve Bank of India
## 22 Finance Secretary\n\nMember-Secretary of the Prime Minister's Economic Advisory Council
## 23 Chief Economic Adviser to the Government of India
## 24 Deputy Governor of the Reserve Bank
## 25 Member of the Fifteenth Finance Commission\nSherpa of India to the G20\nEconomic Affairs Secretary\nRevenue Secretary
## Reference(s)
## 1 [1]
## 2 [2]
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25 [3][4][5]
# Governors ordered by length of tenure, longest first. "Term in office" is
# split into its number and unit, and the number is used as the sort key.
profile %>%
  separate(col = `Term in office`, into = c("term", "days")) %>%
  select(Officeholder, term) %>%
  arrange(desc(as.numeric(term)))
## Officeholder term
## 1 Benegal Rama Rau 2754
## 2 C. D. Deshmukh 2150
## 3 R. N. Malhotra 2147
## 4 Bimal Jalan 2114
## 5 James Braid Taylor 2057
## 6 P. C. Bhattacharya 1947
## 7 Y. Venugopal Reddy 1826
## 8 H. V. R. Iyengar 1825
## 9 D. Subbarao 1825
## 10 Sarukkai Jagannathan 1798
## 11 C. Rangarajan 1795
## 12 I. G. Patel 1749
## 13 Raghuram Rajan 1096
## 14 Lakshmi Kant Jha 1037
## 15 Urjit Patel 946
## 16 Manmohan Singh 851
## 17 Osborne Smith 821
## 18 S. Venkitaramanan 730
## 19 K. R. Puri 621
## 20 M. Narasimham 211
## 21 Shaktikanta Das 117
## 22 N. C. Sen Gupta 92
## 23 K. G. Ambegaonkar 45
## 24 B. N. Adarkar 42
## 25 Amitav Ghosh 20
# Number of governors for each raw Background value.
count(profile, Background)
## # A tibble: 9 x 2
## Background n
## <chr> <int>
## 1 "" 1
## 2 Banker 2
## 3 Career Reserve Bank of India officer 1
## 4 Economist 7
## 5 IAS officer 4
## 6 ICS officer 7
## 7 Indian Administrative Service (IAS) officer 1
## 8 Indian Audit and Accounts Service officer 1
## 9 Indian Civil Service (ICS) officer 1
# Group the raw Background values into broader categories and count the
# governors in each one.
backgrounds <- profile %>%
  pull(Background) %>%
  fct_collapse(
    # All civil-service style backgrounds are merged into one level.
    Bureaucrats = c(
      "IAS officer",
      "ICS officer",
      "Indian Administrative Service (IAS) officer",
      "Indian Audit and Accounts Service officer",
      "Indian Civil Service (ICS) officer"
    ),
    # The empty string marks a governor with no background listed.
    `No Info` = c(""),
    `RBI Officer` = c("Career Reserve Bank of India officer")
  ) %>%
  fct_count() %>%
  rename(background = f, count = n)
backgrounds
## # A tibble: 5 x 2
## background count
## <fct> <int>
## 1 No Info 1
## 2 Banker 2
## 3 RBI Officer 1
## 4 Economist 7
## 5 Bureaucrats 14
# Bar chart of the number of governors per background category.
ggplot(backgrounds) +
  geom_col(mapping = aes(x = background, y = count), fill = "blue") +
  labs(
    x = "Background",
    y = "Count",
    title = "Background of RBI Governors"
  )